<?php
include_once 'common_mbs.inc.php';
include_once 'smarty.inc.php';
  /**
   * Work out which listing page this request should crawl.
   *
   * A missing or non-numeric "page" parameter selects the default start page
   * (430). An explicit numeric zero is honoured as page 0: the crawl counts
   * pages downwards, so mapping 0 back to the default would restart the whole
   * crawl instead of letting it finish.
   *
   * @param array $query Request parameters (normally $_GET).
   * @return int Page number to crawl; may be negative, which the caller treats as "finished".
   */
  function chaiji_resolve_page($query)
  {
	  if (!isset($query['page']) || !is_scalar($query['page']))
	  {
		  return 430;
	  }
	  $requested = (int)$query['page'];
	  if ($requested !== 0)
	  {
		  return $requested;
	  }
	  // The cast gave 0: keep it only when the caller really asked for zero.
	  return is_numeric($query['page']) ? 0 : 430;
  }

  // Reply as UTF-8 HTML and lift the execution time limit for this request.
  header("Content-type:text/html;charset=utf-8");
  set_time_limit(0);

  // Listing URL to crawl; a negative page number means every page has been processed.
  $page = chaiji_resolve_page($_GET);
  if ($page >= 0)
  {
	   $url = "http://www.lvping.com/journals.aspx?type=3&dname=&title=&tag=0&tagn=&author=&group=12&orderby=&pageno=".$page;
  }
  else
  {
	    echo "采集结束";
		exit;
  }

  // Images are spread over up to 100 numbered sub-directories, one picked at random per request.
  $rand = rand(1,100);
  // Filesystem directory that downloaded images are written to.
  $base = "E:/wamp/www/qy/upload/travel/{$rand}/";
  // Path prefix written into the article markup (and so into the database) for those images.
  $img_save_path = "../upload/travel/{$rand}/";
 
  /**
   * Extract the absolute article URLs from a listing page.
   *
   * Only the part of the page between the article-list start marker and the
   * paging marker is searched. Within it, every link of the form
   * <a  target="_blank"  href="..."> whose href contains "showjournal" and ends
   * in "html" is returned, prefixed with the site's base URL.
   *
   * @param string|false $listing_html Listing page markup, or false/'' when the download failed.
   * @return array List of absolute article URLs (empty when nothing matched).
   */
  function chaiji_extract_article_urls($listing_html)
  {
	  // Base URL prepended to every extracted href.
	  $base_url = 'http://www.lvping.com';
	  // Word that an article href must contain.
	  $mast_have_keyword = "showjournal";
	  // Unique markers that open and close the article list.
	  $start_tag = '<div class="forumDetail" id="journal-items-id">';
	  $end_tag = '<div class="new-paging">';

	  $urls = array();
	  if (!is_string($listing_html) || $listing_html === '')
	  {
		  return $urls;
	  }

	  // Markers are quoted so they are always matched literally.
	  $reg = '/' . preg_quote($start_tag, '/') . '(.*)' . preg_quote($end_tag, '/') . '/iUs';
	  if (!preg_match($reg, $listing_html, $section))
	  {
		  return $urls;
	  }

	  // [^"]* keeps the match inside a single href value, so a non-article link
	  // that precedes an article link can no longer be picked up by mistake.
	  $link_reg = '/<a  target="_blank"  href="([^"]*' . preg_quote($mast_have_keyword, '/') . '[^"]*html)">/i';
	  preg_match_all($link_reg, $section[0], $links);
	  foreach ($links[1] as $href)
	  {
		  $urls[] = $base_url . $href;
	  }
	  return $urls;
  }

  // Download the listing page; $url is guarded defensively because an empty path
  // makes file_get_contents() throw on current PHP versions.
  $listing_html = (isset($url) && is_string($url) && $url !== '') ? @file_get_contents($url) : false;

  // Article URLs found on this page, and how many there are.
  $data = chaiji_extract_article_urls($listing_html);
  $artile_count = count($data);

  // Index (within this page's article list) of the article to import in this request.
  // A missing parameter means the first article; negative values are clamped to 0
  // because they would address an entry that does not exist in $data.
  $id = 0;
  if (isset($_GET['id']) && is_scalar($_GET['id']))
  {
	  $id = max(0, (int)$_GET['id']);
  }

  // Every article of this page has been handled: step to the previous page and restart at its first article.
  if ($id >= $artile_count)
  {
	  echo "采集结束";
	  $page--;
	  echo "<script>location.href='http://127.0.0.1/qy/chaiji_1.php?id=0&page=".$page."'</script>";
	  exit;
  }

  // Download the article page and convert it from GBK to UTF-8.
  $file_url = $data[$id];
  $fp = @file_get_contents($file_url);
  if ($fp !== false)
  {
	  $fp = iconv("GBK", "utf-8", $fp);
  }
  // A failed download or conversion leaves an empty page rather than false,
  // so the parsing steps that follow always receive a string.
  if ($fp === false)
  {
	  $fp = '';
  }

   /**
    * Pull the title and the body markup out of an article page.
    *
    * The title is the text of the <h1 id="journal-title" class="journal-title">
    * element; the body is everything between the opening
    * <div class="journal-content" id="journal-content"> tag and the
    * <a id="thanksMes" name="thanksMes"> anchor.
    *
    * @param string|false $html Article page markup.
    * @return array array('title' => string, 'content' => string); a part that
    *               cannot be found is returned as ''.
    */
   function chaiji_parse_article($html)
   {
	   $article = array('title' => '', 'content' => '');
	   if (!is_string($html) || $html === '')
	   {
		   return $article;
	   }

	   // Lazy (U) matching stops at the first closing marker after the opening one.
	   if (preg_match('/<h1 id="journal-title" class="journal-title">(.*)<\/h1>/iUs', $html, $title))
	   {
		   $article['title'] = $title[1];
	   }
	   if (preg_match('/<div class="journal-content" id="journal-content">(.*)<a id="thanksMes" name="thanksMes">/iUs', $html, $body))
	   {
		   $article['content'] = $body[1];
	   }
	   return $article;
   }

   $article = chaiji_parse_article(isset($fp) ? $fp : '');

   // Article title ('' when the page did not contain one).
   $sql_title = $article['title'];

   // The following steps read and rewrite the body through $content[1][0],
   // so that slot is always populated, with '' when nothing was found.
   $content = array(0 => array(), 1 => array(0 => $article['content']));
 
  /**
   * Remove layout and formatting tags from scraped article markup.
   *
   * Opening and closing div, p, span, a, font and b tags are dropped while the
   * text between them is kept; a table is removed together with its contents.
   * Tag names are matched as whole names, so <br>, <blockquote>, <pre>, <abbr>
   * and other tags that merely start with the same letters are left alone.
   *
   * @param string $html Article markup.
   * @return string Markup without the listed tags; the input is returned
   *                unchanged if the regex engine reports an error.
   */
  function chaiji_strip_layout_tags($html)
  {
	  if (!is_string($html) || $html === '')
	  {
		  return '';
	  }

	  $patterns = array();
	  foreach (array('div', 'p', 'span', 'a', 'font', 'b') as $tag_name)
	  {
		  // The name must be followed by whitespace, "/" or ">" to count as this tag.
		  $patterns[] = '/<\/?' . $tag_name . '(?:[\s\/][^>]*)?>/i';
	  }
	  // Tables go last and take their whole content with them.
	  $patterns[] = '/<table(?:[\s\/][^>]*)?>.*?<\/table>/is';

	  $clean = preg_replace($patterns, '', $html);
	  return is_string($clean) ? $clean : $html;
  }

  $content[1][0] = chaiji_strip_layout_tags(isset($content[1][0]) ? $content[1][0] : '');
 
  /**
   * Download the remote images referenced by the article markup and point the
   * matching <img> tags at the local copies.
   *
   * Only http:// and https:// sources are fetched; the markup comes from a
   * scraped page, so any other src (for example a local filesystem path) is
   * left untouched rather than read. An image that cannot be downloaded or
   * saved also keeps its original src. Files are always stored with a .jpg
   * extension.
   *
   * @param string        $html        Article markup.
   * @param string        $save_dir    Directory (with trailing slash) the files are written to; created if missing.
   * @param string        $public_path Path prefix (with trailing slash) written into the rewritten src attributes.
   * @param callable|null $fetch       Optional downloader: receives the URL and returns the bytes or false.
   *                                   Defaults to file_get_contents().
   * @return string Markup with the src of every saved image rewritten.
   */
  function chaiji_localize_images($html, $save_dir, $public_path, $fetch = null)
  {
	  if (!is_string($html) || $html === '')
	  {
		  return '';
	  }
	  if (!preg_match_all('/<img(.*)(\/)?>/iUs', $html, $img_tags))
	  {
		  return $html;
	  }

	  // Remote URL => local src, so an image used several times is downloaded once.
	  $saved = array();
	  foreach ($img_tags[0] as $index => $img_tag)
	  {
		  if (!preg_match('/src="(.*)"/iUs', $img_tag, $src))
		  {
			  continue;
		  }
		  $remote_url = $src[1];
		  if (!preg_match('#^https?://#i', $remote_url))
		  {
			  continue;
		  }

		  if (!isset($saved[$remote_url]))
		  {
			  $img = ($fetch === null) ? @file_get_contents($remote_url) : call_user_func($fetch, $remote_url);
			  if (!is_string($img) || $img === '')
			  {
				  continue;
			  }
			  if (!is_dir($save_dir) && !@mkdir($save_dir, 0777, true) && !is_dir($save_dir))
			  {
				  continue;
			  }
			  // uniqid() plus the image's position cannot repeat within one article,
			  // unlike a timestamp with a 1-100 random suffix.
			  $img_addres = uniqid() . '_' . $index . '.jpg';
			  if (@file_put_contents($save_dir . $img_addres, $img) === false)
			  {
				  continue;
			  }
			  $saved[$remote_url] = $public_path . $img_addres;
		  }

		  // Replace the exact attribute text; no regex is built from the scraped URL.
		  $html = str_replace($src[0], 'src="' . $saved[$remote_url] . '"', $html);
	  }
	  return $html;
  }

  $content[1][0] = chaiji_localize_images(isset($content[1][0]) ? $content[1][0] : '', $base, $img_save_path);
   // Final article body (tags stripped and image paths rewritten by the earlier steps) and its title.
   $sql_content = isset($content[1][0]) ? (string)$content[1][0] : '';
   $news_title = trim(isset($sql_title) ? (string)$sql_title : '');

   // Page description: the first 100 bytes of the body's plain text. Tags are
   // removed first so the description never contains (or is cut inside) markup,
   // and the cut is moved back to a whole UTF-8 character so it cannot end in
   // a broken multibyte sequence.
   $page_des = substr(trim(strip_tags($sql_content)), 0, 100);
   if (preg_match('/^(?:[\x00-\x7F]|[\xC2-\xDF][\x80-\xBF]|[\xE0-\xEF][\x80-\xBF]{2}|[\xF0-\xF4][\x80-\xBF]{3})*/', $page_des, $whole_chars))
   {
	   $page_des = $whole_chars[0];
   }
   $page_des = trim($page_des);

   // Store the article unless neither a title nor a body could be scraped,
   // which would only add an empty row.
   if ($news_title !== '' || trim($sql_content) !== '')
   {
	   $new_obj = new news_cls();
	   $insert_data = array(
		   'news_title'=>$news_title,
		   'news_content'=>$sql_content,
		   'news_addtime'=>time(),
		   'news_adduser'=>$_ADMIN_NAME,
		   'page_title'=>$news_title.'_港澳旅游攻略_启程旅游网',
		   'page_des'=>$page_des,
		   'page_keyword'=>'港澳旅游攻略_启程旅游网',
		   'news_type'=>23
	   );
	   $new_obj->add_news($insert_data);
   }

  // Hand over to the next request: either report that the list is exhausted,
  // or bump the article index and let the browser reload this script with it.
  if (!($id < $artile_count))
  {
	echo "采集结束";
  }
  else
  {
	$id++;
	// Progress message followed by the client-side redirect to the next article of the same page.
	echo "正在采集......".$id;
	echo "<script>location.href='http://127.0.0.1/qy/chaiji_1.php?id=".$id."&page=".$page."'</script>";
  }
?>